#THIS FILE CONTAINS CODE REQUIRED TO REPLICATE MATERIALS IN 
#ARGYLE AND BARBER: MISCLASSIFICATION AND BIAS IN PREDICTIONS OF INDIVIDUAL ETHNICITY FROM ADMINISTRATIVE RECORDS


####################################################
####################################################
#TABLE 1
####################################################
####################################################
nc_file <- read.csv("data/nc_ready_for_analysis.csv")
nc_file <- nc_file[nc_file$sr.race != "Unknown",]

nc_file$sr.race.factor <- as.factor(nc_file$sr.race)

nc_file$predicted.race.party <- ""
nc_file$predicted.race.party[nc_file$predicted.white.party == 1] <- "White"
nc_file$predicted.race.party[nc_file$predicted.black.party == 1] <- "Black"
nc_file$predicted.race.party[nc_file$predicted.latino.party == 1] <- "Hispanic"
nc_file$predicted.race.party[nc_file$predicted.asian.party == 1] <- "Asian"
nc_file$predicted.race.party[nc_file$predicted.other.party == 1] <- "Other"
nc_file <- nc_file[nc_file$predicted.race != "",]

nc_file$pr.race.factor <- as.factor(nc_file$predicted.race.party)

#TABLE 1 - F1-Score
library(yardstick)
f_meas(nc_file, truth = sr.race.factor, estimate = pr.race.factor)
#0.581

#TABLE 1 - overall proportions of races
(table(nc_file$sr.race) / nrow(nc_file))*100

#TABLE 1 - Model with party
#overall error rate
round(mean(nc_file$wrong.race.party, na.rm = T), 3)
#0.154

#false negative
round(mean(nc_file$wrong.race.party[nc_file$sr.race == "White"], na.rm = T), 3)
#0.069
round(mean(nc_file$wrong.race.party[nc_file$sr.race == "Black"], na.rm = T), 3)
#0.341
round(mean(nc_file$wrong.race.party[nc_file$sr.race == "Hispanic"], na.rm = T), 3)
#0.237
round(mean(nc_file$wrong.race.party[nc_file$sr.race == "Asian"], na.rm = T), 3)
#0.340
round(mean(nc_file$wrong.race.party[nc_file$sr.race == "Other"], na.rm = T), 3)
#0.994

#false positive
round(mean(nc_file$predicted.white.party[nc_file$sr.race != "White" & nc_file$sr.race != "Unknown"], na.rm = T), 3)
#0.316
round(mean(nc_file$predicted.black.party[nc_file$sr.race != "Black" & nc_file$sr.race != "Unknown"], na.rm = T), 3)
#0.051
round(mean(nc_file$predicted.latino.party[nc_file$sr.race != "Hispanic" & nc_file$sr.race != "Unknown"], na.rm = T), 3)
#0.015
round(mean(nc_file$predicted.asian.party[nc_file$sr.race != "Asian" & nc_file$sr.race != "Unknown"], na.rm = T), 3)
#0.008
round(mean(nc_file$predicted.other.party[nc_file$sr.race != "Other" & nc_file$sr.race != "Unknown"], na.rm = T), 3)
#0.002

test3 <- nc_file[nc_file$sr.race == "Black",]
test4 <- nc_file[nc_file$sr.race == "Hispanic",]
test5 <- nc_file[nc_file$sr.race == "Asian",]
test6 <- nc_file[nc_file$sr.race == "White",]


####################################################
#FIGURES
####################################################
#INCOME 
#Figure A.8
black.wrong.race <- tapply(X = test3$wrong.race, INDEX = test3$income_rounded, function(x) mean(x, na.rm = T))

point.size <- tapply(X = test3$wrong.race, INDEX = test3$income_rounded, length)
m <- loess(black.wrong.race~names(black.wrong.race), weights = point.size, span = .6)

#7x9 size
#Figure A.8 - (second panel, top row) Black wrong prediction and income
plot(names(black.wrong.race), black.wrong.race, ylim = c(0, 1), xlab = "Median Income of Census Tract ($1,000)", main = "Black Voters (NC)", ylab = "Misclassification Rate", pch = 16, col = "#99999999", axes = F, xlim = c(0, 220000), cex = log(point.size)/5)
axis(side = 1, labels = seq(0, 220, 20), at = seq(0, 220000, 20000), cex.axis = .9)
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=names(black.wrong.race), y=m$fitted, lwd = 4, col = "dark red")
box()

############################################
#Figure A.8 - (third panel, top row) Latino wrong prediction and income
latino.wrong.race <- tapply(X = test4$wrong.race, INDEX = test4$income_rounded, function(x) mean(x, na.rm = T))
point.size <- tapply(X = test4$wrong.race, INDEX = test4$income_rounded, length)
m <- loess(latino.wrong.race~names(latino.wrong.race), weights = point.size, span = .6)

plot(names(latino.wrong.race), latino.wrong.race, ylim = c(0, 1), xlab = "Median Income of Census Tract ($1,000)", main = "Hispanic Voters (NC)", ylab = "Misclassification Rate", pch = 16, col = "#99999999", axes = F, xlim = c(0, 220000), cex = log(point.size)/5)
axis(side = 1, labels = seq(0, 220, 20), at = seq(0, 220000, 20000), cex.axis = .9)
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=m$x, y=m$fitted, lwd = 4, col = "dark red")
box()

############################################
#Figure A.8 - (fourth panel, top row) Asian wrong prediction and income
asian.wrong.race <- tapply(X = test5$wrong.race, INDEX = test5$income_rounded, function(x) mean(x, na.rm = T))
point.size <- tapply(X = test5$wrong.race, INDEX = test5$income_rounded, length)
m <- loess(asian.wrong.race~names(asian.wrong.race), weights = point.size, span = .6)

plot(names(asian.wrong.race), asian.wrong.race, ylim = c(0, 1), xlab = "Median Income of Census Tract ($1,000)", main = "Asian Voters (NC)", ylab = "Misclassification Rate", pch = 16, cex = log(point.size)/5, col = "#99999999", axes = F, xlim = c(0, 220000))
axis(side = 1, labels = seq(0, 220, 20), at = seq(0, 220000, 20000), cex.axis = .9)
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=m$x, y=m$fitted, lwd = 4, col = "dark red")
box()

############################################
#Figure A.8 - (first panel, top row) White wrong prediction and income
white.wrong.race <- tapply(X = test6$wrong.race, INDEX = test6$income_rounded, function(x) mean(x, na.rm = T))
point.size <- tapply(X = test6$wrong.race, INDEX = test6$income_rounded, length)
m <- loess(white.wrong.race~names(white.wrong.race), weights = point.size, span = .6)

plot(names(white.wrong.race), white.wrong.race, ylim = c(0, 1), xlab = "Median Income of Census Tract ($1,000)", main = "White Voters (NC)", ylab = "Misclassification Rate", pch = 16, cex = log(point.size)/5, col = "#99999999", axes = F, xlim = c(0, 220000))
axis(side = 1, labels = seq(0, 220, 20), at = seq(0, 220000, 20000), cex.axis = .9)
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=m$x, y=m$fitted, lwd = 4, col = "dark red")
box()


################################################################
## EDUCATIONAL ATTAINMENT ######################################
################################################################

#Figure A.8 (second panel, bottom row) - black wrong prediction and education
black.educ.wrong.race <- tapply(X = test3$wrong.race, INDEX = test3$college_rounded, function(x) mean(x, na.rm = T))
point.size <- tapply(X = test3$wrong.race, INDEX = test3$college_rounded, length)
m <- loess(black.educ.wrong.race~names(black.educ.wrong.race), weights = point.size, span = .6)

plot(names(black.educ.wrong.race), black.educ.wrong.race, ylim = c(0, 1), xlab = "Percent of Census Tract with College Degree", main = "Black Voters (NC)", ylab = "Misclassification Rate", pch = 16, cex = log(point.size)/5, col = "#99999999", axes = F, xlim = c(0, 100), cex.axis = .9)
axis(side = 1, labels = seq(0, 100, 10), at = seq(0, 100, 10))
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=m$x, y=m$fitted, lwd = 4, col = "dark red")
box()

#Figure A.8 - (third panel, bottom row) Latino wrong prediction and education
latino.educ.wrong.race <- tapply(X = test4$wrong.race, INDEX = test4$college_rounded, function(x) mean(x, na.rm = T))
point.size <- tapply(X = test4$wrong.race, INDEX = test4$college_rounded, length)
m <- loess(latino.educ.wrong.race~names(latino.educ.wrong.race), weights = point.size, span = .6)

plot(names(latino.educ.wrong.race), latino.educ.wrong.race, ylim = c(0, 1), xlab = "Percent of Census Tract with College Degree", main = "Hispanic Voters (NC)", ylab = "Misclassification Rate", pch = 16, cex = log(point.size)/5, col = "#99999999", axes = F, xlim = c(0, 100))
axis(side = 1, labels = seq(0, 100, 10), at = seq(0, 100, 10), cex.axis = .9)
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=m$x, y=m$fitted, lwd = 4, col = "dark red")
box()

#Figure A.8 - (fourth panel, bottom row) Asian wrong prediction and education
asian.educ.wrong.race <- tapply(X = test5$wrong.race, INDEX = test5$college_rounded, function(x) mean(x, na.rm = T))
point.size <- tapply(X = test5$wrong.race, INDEX = test5$college_rounded, length)
m <- loess(asian.educ.wrong.race~names(asian.educ.wrong.race), weights = point.size, span = .6)

plot(names(asian.educ.wrong.race), asian.educ.wrong.race, ylim = c(0, 1), xlab = "Percent of Census Tract with College Degree", main = "Asian Voters (NC)", ylab = "Misclassification Rate", pch = 16, cex = log(point.size)/5, col = "#99999999", axes = F, xlim = c(0, 100))
axis(side = 1, labels = seq(0, 100, 10), at = seq(0, 100, 10), cex.axis = .9)
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=m$x, y=m$fitted, lwd = 4, col = "dark red")
box()

#Figure A.8 - (first panel, bottom row) White wrong prediction and education
white.educ.wrong.race <- tapply(X = test6$wrong.race, INDEX = test6$college_rounded, function(x) mean(x, na.rm = T))
point.size <- tapply(X = test6$wrong.race, INDEX = test6$college_rounded, length)
m <- loess(white.educ.wrong.race~names(white.educ.wrong.race), weights = point.size, span = .6)

plot(names(white.educ.wrong.race), white.educ.wrong.race, ylim = c(0, 1), xlab = "Percent of Census Tract with College Degree", main = "White Voters (NC)", ylab = "Misclassification Rate", pch = 16, cex = log(point.size)/5, col = "#99999999", axes = F, xlim = c(0, 100))
axis(side = 1, labels = seq(0, 100, 10), at = seq(0, 100, 10), cex.axis = .9)
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=m$x, y=m$fitted, lwd = 4, col = "dark red")
box()

#######################################
######### HOME OWNERSHIP###############
#######################################

#Figure A.9 (second panel, top row) - black wrong prediction and home ownership
black.home.wrong.race <- tapply(X = test3$wrong.race, INDEX = test3$homeowner_rounded, function(x) mean(x, na.rm = T))
point.size <- tapply(X = test3$wrong.race, INDEX = test3$homeowner_rounded, length)
m <- loess(black.home.wrong.race ~ names(black.home.wrong.race), span = .6, weights = point.size)

plot(names(black.home.wrong.race), black.home.wrong.race, ylim = c(0, 1), xlab = "Percent of Census Tract who are Homeowners", main = "Black Voters (NC)", ylab = "Misclassification Rate", pch = 16, cex = log(point.size)/5, col = "#99999999", axes = F, xlim = c(0, 100))
axis(side = 1, labels = seq(0, 100, 10), at = seq(0, 100, 10), cex.axis = .9)
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=names(black.home.wrong.race), y=m$fitted, lwd = 4, col = "dark red")
box()

#Figure A.9 - (third panel, top row) Latino wrong prediction and home ownership
latino.home.wrong.race <- tapply(X = test4$wrong.race, INDEX = test4$homeowner_rounded, function(x) mean(x, na.rm = T))
point.size <- tapply(X = test4$wrong.race, INDEX = test4$homeowner_rounded, length)
m <- loess(latino.home.wrong.race ~ names(latino.home.wrong.race), span = .6, weights = point.size)

plot(names(latino.home.wrong.race), latino.home.wrong.race, ylim = c(0, 1), xlab = "Percent of Census Tract who are Homeowners", main = "Hispanic Voters (NC)", ylab = "Misclassification Rate", pch = 16, cex = log(point.size)/5, col = "#99999999", axes = F, xlim = c(0, 100))
axis(side = 1, labels = seq(0, 100, 10), at = seq(0, 100, 10), cex.axis = .9)
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=names(latino.home.wrong.race), y=m$fitted, lwd = 4, col = "dark red")
box()

#Figure A.9 - (fourth panel, top row) Asian wrong prediction and home ownership
asian.home.wrong.race <- tapply(X = test5$wrong.race, INDEX = test5$homeowner_rounded, function(x) mean(x, na.rm = T))
point.size <- tapply(X = test5$wrong.race, INDEX = test5$homeowner_rounded, length)
m <- loess(asian.home.wrong.race ~ names(asian.home.wrong.race), span = .6, weights = point.size)

plot(names(asian.home.wrong.race), asian.home.wrong.race, ylim = c(0, 1), xlab = "Percent of Census Tract who are Homeowners", main = "Asian Voters (NC)", ylab = "Misclassification Rate", pch = 16, cex = log(point.size)/5, col = "#99999999", axes = F, xlim = c(0, 100))
axis(side = 1, labels = seq(0, 100, 10), at = seq(0, 100, 10), cex.axis = .9)
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=names(asian.home.wrong.race), y=m$fitted, lwd = 4, col = "dark red")
box()

#Figure A.9 - (first panel, top row) White wrong prediction and home ownership
white.home.wrong.race <- tapply(X = test6$wrong.race, INDEX = test6$homeowner_rounded, function(x) mean(x, na.rm = T))
point.size <- tapply(X = test6$wrong.race, INDEX = test6$homeowner_rounded, length)
m <- loess(white.home.wrong.race ~ names(white.home.wrong.race), span = .6, weights = point.size)

plot(names(white.home.wrong.race), white.home.wrong.race, ylim = c(0, 1), xlab = "Percent of Census Tract who are Homeowners", main = "White Voters (NC)", ylab = "Misclassification Rate", pch = 16, cex = log(point.size)/5, col = "#99999999", axes = F, xlim = c(0, 100))
axis(side = 1, labels = seq(0, 100, 10), at = seq(0, 100, 10), cex.axis = .9)
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=names(white.home.wrong.race), y=m$fitted, lwd = 4, col = "dark red")
box()

#########################################################
## VOTE PROPENSITY ######################################
#########################################################

#Figure A.9 (second panel, bottom row) - black wrong prediction and vote propensity
black.vote.wrong.race <- tapply(X = test3$wrong.race, INDEX = test3$vote_rounded, function(x) mean(x, na.rm = T))
point.size <- tapply(X = test3$wrong.race, INDEX = test3$vote_rounded, length)
m <- loess(black.vote.wrong.race~names(black.vote.wrong.race), weights = point.size, span = .6)

plot(names(black.vote.wrong.race), black.vote.wrong.race, ylim = c(0, 1), xlab = "Predicted Probability of Voting in 2016", main = "Black Voters (NC)", ylab = "Misclassification Rate", pch = 16, cex = log(point.size)/5, col = "#99999999", axes = F, xlim = c(0, 1))
axis(side = 1, labels = seq(0, 1, .1), at = seq(0, 1, .1), cex.axis = .9)
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=names(black.vote.wrong.race), y=m$fitted, lwd = 4, col = "dark red")
box()

#Figure A.9 - (third panel, bottom row) Latino wrong prediction and vote propensity
latino.vote.wrong.race <- tapply(X = test4$wrong.race, INDEX = test4$vote_rounded, function(x) mean(x, na.rm = T))
point.size <- tapply(X = test4$wrong.race, INDEX = test4$vote_rounded, length)
m <- loess(latino.vote.wrong.race~names(latino.vote.wrong.race), weights = point.size, span = .6)

plot(names(latino.vote.wrong.race), latino.vote.wrong.race, ylim = c(0, 1), xlab = "Predicted Probability of Voting in 2016", main = "Hispanic Voters (NC)", ylab = "Misclassification Rate", pch = 16, cex = log(point.size)/5, col = "#99999999", axes = F, xlim = c(0, 1))
axis(side = 1, labels = seq(0, 1, .10), at = seq(0, 1, .10), cex.axis = .9)
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=names(latino.vote.wrong.race), y=m$fitted, lwd = 4, col = "dark red")
box()

#Figure A.9 - (fourth panel, bottom row) Asian wrong prediction and vote propensity
asian.vote.wrong.race <- tapply(X = test5$wrong.race, INDEX = test5$vote_rounded, function(x) mean(x, na.rm = T))
point.size <- tapply(X = test5$wrong.race, INDEX = test5$vote_rounded, length)
m <- loess(asian.vote.wrong.race~names(asian.vote.wrong.race), weights = point.size, span = .6)

plot(names(asian.vote.wrong.race), asian.vote.wrong.race, ylim = c(0, 1), xlab = "Predicted Probability of Voting in 2016", main = "Asian Voters (NC)", ylab = "Misclassification Rate", pch = 16, cex = log(point.size)/5, col = "#99999999", axes = F, xlim = c(0, 1))
axis(side = 1, labels = seq(0, 1, .10), at = seq(0, 1, .10), cex.axis = .9)
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=names(asian.vote.wrong.race), y=m$fitted, lwd = 4, col = "dark red")
box()

#Figure A.9 - (first panel, bottom row) White wrong prediction and vote propensity
white.vote.wrong.race <- tapply(X = test6$wrong.race, INDEX = test6$vote_rounded, function(x) mean(x, na.rm = T))
point.size <- tapply(X = test6$wrong.race, INDEX = test6$vote_rounded, length)
m <- loess(white.vote.wrong.race~names(white.vote.wrong.race), weights = point.size, span = .6)

plot(names(white.vote.wrong.race), white.vote.wrong.race, ylim = c(0, 1), xlab = "Predicted Probability of Voting in 2016", main = "White Voters (NC)", ylab = "Misclassification Rate", pch = 16, cex = log(point.size)/5, col = "#99999999", axes = F, xlim = c(0, 1))
axis(side = 1, labels = seq(0, 1, .10), at = seq(0, 1, .10), cex.axis = .9)
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=names(white.vote.wrong.race), y=m$fitted, lwd = 4, col = "dark red")
box()

##################################
########tract diversity###########
##################################

#Figure A.6 (second panel, bottom row) - black wrong prediction and tract diversity
black.diversity.wrong.race <- tapply(X = test3$wrong.race, INDEX = test3$pct.black.tract_rounded, function(x) mean(x, na.rm = T))
point.size <- tapply(X = test3$wrong.race, INDEX = test3$pct.black.tract_rounded, length)
m <- loess(black.diversity.wrong.race ~ names(black.diversity.wrong.race), span = .6, weights = point.size)

plot(names(black.diversity.wrong.race), black.diversity.wrong.race, ylim = c(0, 1), xlab = "Percent of Census Tract who are Black", main = "Black Voters  (NC)", ylab = "Misclassification Rate", pch = 16, cex = log(point.size)/5, col = "#99999999", axes = F, xlim = c(0, 1))
axis(side = 1, labels = seq(0, 100, 10), at = seq(0, 1, .1), cex.axis = .90)
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=names(black.diversity.wrong.race), y=m$fitted, lwd = 4, col = "dark red")
box()

#Figure A.6 -  (third panel, bottom row) Latino wrong prediction and tract diversity
latino.diversity.wrong.race <- tapply(X = test4$wrong.race, INDEX = test4$pct.latino.tract_rounded, function(x) mean(x, na.rm = T))
point.size <- tapply(X = test4$wrong.race, INDEX = test4$pct.latino.tract_rounded, length)
m <- loess(latino.diversity.wrong.race ~ names(latino.diversity.wrong.race), span = .6, weights = point.size)

plot(names(latino.diversity.wrong.race), latino.diversity.wrong.race, ylim = c(0, 1), xlab = "Percent of Census Tract who are Latino", main = "Latino Voters (NC)", ylab = "Misclassification Rate", pch = 16, cex = log(point.size)/5, col = "#99999999", axes = F, xlim = c(0, 1))
axis(side = 1, labels = seq(0, 100, 10), at = seq(0, 1, .1))
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=m$x, y=m$fitted, lwd = 4, col = "dark red")
box()

#Figure A.6 -  (fourth panel, bottom row) Asian wrong prediction and tract diversity
asian.diverstiy.wrong.race <- tapply(X = test5$wrong.race, INDEX = test5$pct.asian.tract_rounded, function(x) mean(x, na.rm = T))
point.size <- tapply(X = test5$wrong.race, INDEX = test5$pct.asian.tract_rounded, length)
m <- loess(asian.diverstiy.wrong.race ~ names(asian.diverstiy.wrong.race), span = .6, weights = point.size)

plot(names(asian.diverstiy.wrong.race), asian.diverstiy.wrong.race, ylim = c(0, 1), xlab = "Percent of Census Tract who are Asian", main = "Asian Voters (NC)", ylab = "Misclassification Rate", pch = 16, cex = log(point.size)/5, col = "#99999999", axes = F, xlim = c(0, 1))
axis(side = 1, labels = seq(0, 100, 10), at = seq(0, 1, .1))
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=m$x, y=m$fitted, lwd = 4, col = "dark red")
box()

#Figure A.6 -  (first panel, bottom row) White wrong prediction and tract diversity
white.diversity.wrong.race <- tapply(X = test6$wrong.race, INDEX = test6$pct.white.tract_rounded, function(x) mean(x, na.rm = T))
point.size <- tapply(X = test6$wrong.race, INDEX = test6$pct.white.tract_rounded, length)
m <- loess(white.diversity.wrong.race ~ names(white.diversity.wrong.race), span = .6, weights = point.size)

plot(names(white.diversity.wrong.race), white.diversity.wrong.race, ylim = c(0, 1), xlab = "Percent of Census Tract who are White", main = "White Voters (NC)", ylab = "Misclassification Rate", pch = 16, cex = log(point.size)/5, col = "#99999999", axes = F, xlim = c(0, 1))
axis(side = 1, labels = seq(0, 100, 10), at = seq(0, 1, .1), cex.axis = .9)
axis(side = 2, labels = seq(0, 1, .1), at = seq(0, 1, .1), las = 2)
lines(x=m$x, y=m$fitted, lwd = 4, col = "dark red")
box()
